Loading important packages
library(readr, warn.conflicts=F)
library(RColorBrewer, warn.conflicts=F) #Rcolorbrewer palette
library(corrplot, warn.conflicts=F)
## corrplot 0.84 loaded
library(ggcorrplot, warn.conflicts=F)
## Loading required package: ggplot2
library(plotly, warn.conflicts=F)
library(ggplot2, warn.conflicts=F)
library(reshape, warn.conflicts=F)
library(viridis, warn.conflicts=F)
## Loading required package: viridisLite
library(tidyverse, warn.conflicts=F)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble 3.0.5 ✓ dplyr 1.0.5
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ purrr 0.3.4 ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x tidyr::expand() masks reshape::expand()
## x dplyr::filter() masks plotly::filter(), stats::filter()
## x dplyr::lag() masks stats::lag()
## x dplyr::rename() masks reshape::rename(), plotly::rename()
library(hrbrthemes, warn.conflicts=F)
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
## Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
## if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
library(psych, warn.conflicts=F)
library(class, warn.conflicts=F)
library(caret, warn.conflicts = F)
## Loading required package: lattice
library(DescTools)
##
## Attaching package: 'DescTools'
## The following objects are masked from 'package:caret':
##
## MAE, RMSE
## The following objects are masked from 'package:psych':
##
## AUC, ICC, SD
library(sjPlot)
## Install package "strengejacke" from GitHub (`devtools::install_github("strengejacke/strengejacke")`) to load all sj-packages at once!
library(kernlab)
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:psych':
##
## alpha
## The following object is masked from 'package:purrr':
##
## cross
## The following object is masked from 'package:ggplot2':
##
## alpha
library(caret)
library(Matrix)
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## The following object is masked from 'package:reshape':
##
## expand
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
# Fix the random-number seed so that any randomised steps run after this point
# give the same results on every run.
set.seed(123456789)
The data was created by Dr. William H. Wolberg, W. Nick Street, and Olvi L. Mangasarian and then uploaded to Kaggle by Street in 1995. The data set describes characteristics of the cell nuclei of a breast mass. The nuclei are seen through digitized images of a fine needle aspirate (FNA) of a breast mass. FNA is a type of biopsy performed to obtain samples of tissue and fluid from breast lesions using a twenty-one to twenty-five gauge needle. It is one of the ways doctors can diagnose breast cancer without removing the mass first.
# Data dictionary for the data set: one row per variable with a short
# description of what it measures.
text_tbl <- data.frame(
  Variable = c(
    "Diagnosis", "Radius", "Texture", "Perimeter", "Area", "Smoothness",
    "Compactness", "Concavity", "Concave Points", "Symmetry",
    "Fractural Dimension"
  ),
  Description = c(
    "M for malignant or B for benign",
    "The mean of three measured distances from center to perimeter",
    "The standard deviation of gray-scale values",
    "The measure of the distance around the boundary of the nuclei",
    "The measure of the surface of the nuclei",
    "The variation in radius lengths",
    "The measure equal to the perimeter squared divided by the area all minus one",
    "The severity of concave portions on the contour",
    "The number of concave portions of the contour",
    "The measure of the likeness across any diameter of the nuclei",
    "The measure of the “coastline approximation” minus one"
  )
)

# Render the dictionary as a styled table: bold variable names and a
# fixed-width description column. TRUE/FALSE are spelled out because the
# shorthands T and F are ordinary variables that can be reassigned.
kbl(text_tbl, booktabs = TRUE) %>%
  kable_styling(full_width = FALSE) %>%
  column_spec(1, bold = TRUE) %>%
  column_spec(2, width = "30em")
| Variable | Description |
|---|---|
| Diagnosis | M for malignant or B for benign |
| Radius | The mean of three measured distances from center to perimeter |
| Texture | The standard deviation of gray-scale values |
| Perimeter | The measure of the distance around the boundary of the nuclei |
| Area | The measure of the surface of the nuclei |
| Smoothness | The variation in radius lengths |
| Compactness | The measure equal to the perimeter squared divided by the area all minus one |
| Concavity | The severity of concave portions on the contour |
| Concave Points | The number of concave portions of the contour |
| Symmetry | The measure of the likeness across any diameter of the nuclei |
| Fractural Dimension | The measure of the “coastline approximation” minus one |
Importing Dataset
# Load the raw CSV from the local Downloads folder into a tibble named `data`.
data <- readr::read_csv(file = "~/Downloads/data.csv")
## Warning: Missing column names filled in: 'X33' [33]
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## .default = col_double(),
## diagnosis = col_character(),
## X33 = col_character()
## )
## ℹ Use `spec()` for the full column specifications.
## Warning: 569 parsing failures.
## row col expected actual file
## 1 -- 33 columns 32 columns '~/Downloads/data.csv'
## 2 -- 33 columns 32 columns '~/Downloads/data.csv'
## 3 -- 33 columns 32 columns '~/Downloads/data.csv'
## 4 -- 33 columns 32 columns '~/Downloads/data.csv'
## 5 -- 33 columns 32 columns '~/Downloads/data.csv'
## ... ... .......... .......... ......................
## See problems(...) for more details.
# Print the imported tibble to inspect its dimensions and column types.
data
## # A tibble: 569 x 33
## id diagnosis radius_mean texture_mean perimeter_mean area_mean
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 8.42e5 M 18.0 10.4 123. 1001
## 2 8.43e5 M 20.6 17.8 133. 1326
## 3 8.43e7 M 19.7 21.2 130 1203
## 4 8.43e7 M 11.4 20.4 77.6 386.
## 5 8.44e7 M 20.3 14.3 135. 1297
## 6 8.44e5 M 12.4 15.7 82.6 477.
## 7 8.44e5 M 18.2 20.0 120. 1040
## 8 8.45e7 M 13.7 20.8 90.2 578.
## 9 8.45e5 M 13 21.8 87.5 520.
## 10 8.45e7 M 12.5 24.0 84.0 476.
## # … with 559 more rows, and 27 more variables: smoothness_mean <dbl>,
## # compactness_mean <dbl>, concavity_mean <dbl>, `concave points_mean` <dbl>,
## # symmetry_mean <dbl>, fractal_dimension_mean <dbl>, radius_se <dbl>,
## # texture_se <dbl>, perimeter_se <dbl>, area_se <dbl>, smoothness_se <dbl>,
## # compactness_se <dbl>, concavity_se <dbl>, `concave points_se` <dbl>,
## # symmetry_se <dbl>, fractal_dimension_se <dbl>, radius_worst <dbl>,
## # texture_worst <dbl>, perimeter_worst <dbl>, area_worst <dbl>,
## # smoothness_worst <dbl>, compactness_worst <dbl>, concavity_worst <dbl>,
## # `concave points_worst` <dbl>, symmetry_worst <dbl>,
## # fractal_dimension_worst <dbl>, X33 <chr>
Looking at dataset
# Preview the first six rows of the dataset.
head(data, n = 6L)
## # A tibble: 6 x 33
## id diagnosis radius_mean texture_mean perimeter_mean area_mean
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 8.42e5 M 18.0 10.4 123. 1001
## 2 8.43e5 M 20.6 17.8 133. 1326
## 3 8.43e7 M 19.7 21.2 130 1203
## 4 8.43e7 M 11.4 20.4 77.6 386.
## 5 8.44e7 M 20.3 14.3 135. 1297
## 6 8.44e5 M 12.4 15.7 82.6 477.
## # … with 27 more variables: smoothness_mean <dbl>, compactness_mean <dbl>,
## # concavity_mean <dbl>, `concave points_mean` <dbl>, symmetry_mean <dbl>,
## # fractal_dimension_mean <dbl>, radius_se <dbl>, texture_se <dbl>,
## # perimeter_se <dbl>, area_se <dbl>, smoothness_se <dbl>,
## # compactness_se <dbl>, concavity_se <dbl>, `concave points_se` <dbl>,
## # symmetry_se <dbl>, fractal_dimension_se <dbl>, radius_worst <dbl>,
## # texture_worst <dbl>, perimeter_worst <dbl>, area_worst <dbl>,
## # smoothness_worst <dbl>, compactness_worst <dbl>, concavity_worst <dbl>,
## # `concave points_worst` <dbl>, symmetry_worst <dbl>,
## # fractal_dimension_worst <dbl>, X33 <chr>
Columns in dataset
# List the name of every column in the dataset.
names(data)
## [1] "id" "diagnosis"
## [3] "radius_mean" "texture_mean"
## [5] "perimeter_mean" "area_mean"
## [7] "smoothness_mean" "compactness_mean"
## [9] "concavity_mean" "concave points_mean"
## [11] "symmetry_mean" "fractal_dimension_mean"
## [13] "radius_se" "texture_se"
## [15] "perimeter_se" "area_se"
## [17] "smoothness_se" "compactness_se"
## [19] "concavity_se" "concave points_se"
## [21] "symmetry_se" "fractal_dimension_se"
## [23] "radius_worst" "texture_worst"
## [25] "perimeter_worst" "area_worst"
## [27] "smoothness_worst" "compactness_worst"
## [29] "concavity_worst" "concave points_worst"
## [31] "symmetry_worst" "fractal_dimension_worst"
## [33] "X33"
Checking for missing values
# Alternative per-column count of missing values, kept for reference:
# lapply(data, function(x) { length(which(is.na(x))) })
# Summary of the skim() output: of the 2 character variables, diagnosis has no
# missing values while X33 is missing in all 569 rows; the remaining 31
# variables are numeric.
skimr::skim(data)
| Name | data |
| Number of rows | 569 |
| Number of columns | 33 |
| _______________________ | |
| Column type frequency: | |
| character | 2 |
| numeric | 31 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| diagnosis | 0 | 1 | 1 | 1 | 0 | 2 | 0 |
| X33 | 569 | 0 | NA | NA | 0 | 0 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| id | 0 | 1 | 30371831.43 | 125020585.61 | 8670.00 | 869218.00 | 906024.00 | 8813129.00 | 911320502.00 | ▇▁▁▁▁ |
| radius_mean | 0 | 1 | 14.13 | 3.52 | 6.98 | 11.70 | 13.37 | 15.78 | 28.11 | ▂▇▃▁▁ |
| texture_mean | 0 | 1 | 19.29 | 4.30 | 9.71 | 16.17 | 18.84 | 21.80 | 39.28 | ▃▇▃▁▁ |
| perimeter_mean | 0 | 1 | 91.97 | 24.30 | 43.79 | 75.17 | 86.24 | 104.10 | 188.50 | ▃▇▃▁▁ |
| area_mean | 0 | 1 | 654.89 | 351.91 | 143.50 | 420.30 | 551.10 | 782.70 | 2501.00 | ▇▃▂▁▁ |
| smoothness_mean | 0 | 1 | 0.10 | 0.01 | 0.05 | 0.09 | 0.10 | 0.11 | 0.16 | ▁▇▇▁▁ |
| compactness_mean | 0 | 1 | 0.10 | 0.05 | 0.02 | 0.06 | 0.09 | 0.13 | 0.35 | ▇▇▂▁▁ |
| concavity_mean | 0 | 1 | 0.09 | 0.08 | 0.00 | 0.03 | 0.06 | 0.13 | 0.43 | ▇▃▂▁▁ |
| concave points_mean | 0 | 1 | 0.05 | 0.04 | 0.00 | 0.02 | 0.03 | 0.07 | 0.20 | ▇▃▂▁▁ |
| symmetry_mean | 0 | 1 | 0.18 | 0.03 | 0.11 | 0.16 | 0.18 | 0.20 | 0.30 | ▁▇▅▁▁ |
| fractal_dimension_mean | 0 | 1 | 0.06 | 0.01 | 0.05 | 0.06 | 0.06 | 0.07 | 0.10 | ▆▇▂▁▁ |
| radius_se | 0 | 1 | 0.41 | 0.28 | 0.11 | 0.23 | 0.32 | 0.48 | 2.87 | ▇▁▁▁▁ |
| texture_se | 0 | 1 | 1.22 | 0.55 | 0.36 | 0.83 | 1.11 | 1.47 | 4.88 | ▇▅▁▁▁ |
| perimeter_se | 0 | 1 | 2.87 | 2.02 | 0.76 | 1.61 | 2.29 | 3.36 | 21.98 | ▇▁▁▁▁ |
| area_se | 0 | 1 | 40.34 | 45.49 | 6.80 | 17.85 | 24.53 | 45.19 | 542.20 | ▇▁▁▁▁ |
| smoothness_se | 0 | 1 | 0.01 | 0.00 | 0.00 | 0.01 | 0.01 | 0.01 | 0.03 | ▇▃▁▁▁ |
| compactness_se | 0 | 1 | 0.03 | 0.02 | 0.00 | 0.01 | 0.02 | 0.03 | 0.14 | ▇▃▁▁▁ |
| concavity_se | 0 | 1 | 0.03 | 0.03 | 0.00 | 0.02 | 0.03 | 0.04 | 0.40 | ▇▁▁▁▁ |
| concave points_se | 0 | 1 | 0.01 | 0.01 | 0.00 | 0.01 | 0.01 | 0.01 | 0.05 | ▇▇▁▁▁ |
| symmetry_se | 0 | 1 | 0.02 | 0.01 | 0.01 | 0.02 | 0.02 | 0.02 | 0.08 | ▇▃▁▁▁ |
| fractal_dimension_se | 0 | 1 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.03 | ▇▁▁▁▁ |
| radius_worst | 0 | 1 | 16.27 | 4.83 | 7.93 | 13.01 | 14.97 | 18.79 | 36.04 | ▆▇▃▁▁ |
| texture_worst | 0 | 1 | 25.68 | 6.15 | 12.02 | 21.08 | 25.41 | 29.72 | 49.54 | ▃▇▆▁▁ |
| perimeter_worst | 0 | 1 | 107.26 | 33.60 | 50.41 | 84.11 | 97.66 | 125.40 | 251.20 | ▇▇▃▁▁ |
| area_worst | 0 | 1 | 880.58 | 569.36 | 185.20 | 515.30 | 686.50 | 1084.00 | 4254.00 | ▇▂▁▁▁ |
| smoothness_worst | 0 | 1 | 0.13 | 0.02 | 0.07 | 0.12 | 0.13 | 0.15 | 0.22 | ▂▇▇▂▁ |
| compactness_worst | 0 | 1 | 0.25 | 0.16 | 0.03 | 0.15 | 0.21 | 0.34 | 1.06 | ▇▅▁▁▁ |
| concavity_worst | 0 | 1 | 0.27 | 0.21 | 0.00 | 0.11 | 0.23 | 0.38 | 1.25 | ▇▅▂▁▁ |
| concave points_worst | 0 | 1 | 0.11 | 0.07 | 0.00 | 0.06 | 0.10 | 0.16 | 0.29 | ▅▇▅▃▁ |
| symmetry_worst | 0 | 1 | 0.29 | 0.06 | 0.16 | 0.25 | 0.28 | 0.32 | 0.66 | ▅▇▁▁▁ |
| fractal_dimension_worst | 0 | 1 | 0.08 | 0.02 | 0.06 | 0.07 | 0.08 | 0.09 | 0.21 | ▇▃▁▁▁ |
We can see that there appear to be three categories of measurements in the dataset: mean, se and worst.
DATA WRANGLING: Deleting the X33 column, as it appears to be an artifact of importing the dataset
# Remove the columns named in `drops`: a column is kept when its name has no
# match in `drops`. The trimmed tibble is then printed.
drops <- "X33"
keep_column <- is.na(match(names(data), drops))
data <- data[, keep_column]
data
## # A tibble: 569 x 32
## id diagnosis radius_mean texture_mean perimeter_mean area_mean
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 8.42e5 M 18.0 10.4 123. 1001
## 2 8.43e5 M 20.6 17.8 133. 1326
## 3 8.43e7 M 19.7 21.2 130 1203
## 4 8.43e7 M 11.4 20.4 77.6 386.
## 5 8.44e7 M 20.3 14.3 135. 1297
## 6 8.44e5 M 12.4 15.7 82.6 477.
## 7 8.44e5 M 18.2 20.0 120. 1040
## 8 8.45e7 M 13.7 20.8 90.2 578.
## 9 8.45e5 M 13 21.8 87.5 520.
## 10 8.45e7 M 12.5 24.0 84.0 476.
## # … with 559 more rows, and 26 more variables: smoothness_mean <dbl>,
## # compactness_mean <dbl>, concavity_mean <dbl>, `concave points_mean` <dbl>,
## # symmetry_mean <dbl>, fractal_dimension_mean <dbl>, radius_se <dbl>,
## # texture_se <dbl>, perimeter_se <dbl>, area_se <dbl>, smoothness_se <dbl>,
## # compactness_se <dbl>, concavity_se <dbl>, `concave points_se` <dbl>,
## # symmetry_se <dbl>, fractal_dimension_se <dbl>, radius_worst <dbl>,
## # texture_worst <dbl>, perimeter_worst <dbl>, area_worst <dbl>,
## # smoothness_worst <dbl>, compactness_worst <dbl>, concavity_worst <dbl>,
## # `concave points_worst` <dbl>, symmetry_worst <dbl>,
## # fractal_dimension_worst <dbl>
Finally, we got rid of all the missing values, so the modified data is ready to use for further analysis.
Let’s look at the correlation matrix to see the correlations between all the variables
# Pearson correlations between all numeric features.
# `id` is numeric but is only a record identifier, so it is excluded here;
# correlating it with the measurements would add a meaningless row and column.
# vapply() guarantees a logical vector regardless of the input.
is_feature <- vapply(data, is.numeric, logical(1)) & names(data) != "id"
matrixData <- cor(data[is_feature], method = "pearson")

# RColorBrewer "PiYG" palette interpolated to 25 shades.
coul <- colorRampPalette(brewer.pal(8, "PiYG"))(25)
# NOTE(review): scale = "column" centres and scales each column before
# colouring, so colours do not map directly to raw correlation values --
# confirm this is intended.
heatmap(matrixData, scale = "column", col = coul)
corrplot(
  matrixData,
  tl.col = "black", order = "hclust", hclust.method = "average",
  addrect = 4, tl.cex = 0.7
)

# One correlation matrix per measurement family. Columns are selected by their
# name suffix rather than by position so the selection survives column
# reordering.
data.mean <- cor(data[, grep("_mean$", names(data))], method = "pearson")
data.se <- cor(data[, grep("_se$", names(data))], method = "pearson")
data.worst <- cor(data[, grep("_worst$", names(data))], method = "pearson")

corrplot(
  data.mean,
  tl.col = "black", order = "hclust", hclust.method = "average",
  addrect = 4, tl.cex = 0.7
)
corrplot(
  data.se,
  tl.col = "black", order = "hclust", hclust.method = "average",
  addrect = 4, tl.cex = 0.7
)
corrplot(
  data.worst,
  tl.col = "black", order = "hclust", hclust.method = "average",
  addrect = 4, tl.cex = 0.7
)
# Count how many observations fall in each diagnosis class.
table(data[["diagnosis"]])
##
## B M
## 357 212
# Number of observations per diagnosis together with each class's share of
# the total.
relative_freq <- data %>%
  count(diagnosis) %>%
  mutate(relative_freq = n / sum(n))
relative_freq
## # A tibble: 2 x 3
## diagnosis n relative_freq
## <chr> <int> <dbl>
## 1 B 357 0.627
## 2 M 212 0.373
# Bar chart of the number of observations per diagnosis, one "Set1" colour per
# class and no legend.
ggplot(
  data = data,
  mapping = aes(x = as.factor(diagnosis), fill = as.factor(diagnosis))
) +
  geom_bar() +
  scale_fill_brewer(palette = "Set1") +
  theme(legend.position = "none") +
  labs(title = "Barplot representing two different tumors")
# Scatter-plot matrices for the three measurement families. Each entry maps a
# plot title to the column positions of that family (mean, se, worst).
panel_groups <- list(
  "Cancer Mean" = 3:12,
  "Cancer SE" = 13:22,
  "Cancer Worst" = 23:32
)
for (panel_title in names(panel_groups)) {
  pairs.panels(data[, panel_groups[[panel_title]]], main = panel_title)
}
Now we will construct nine violin plots for the radius, perimeter and area of the tumor cell nuclei, based on the mean, se and worst values, to compare their distributions between the two diagnoses.
# Draw a violin plot of one numeric column split by diagnosis, with a narrow
# box plot overlaid on each violin.
#
# Arguments:
#   df            Data frame containing `diagnosis` and the column `y_var`.
#   y_var         Name of the numeric column to plot, given as a string.
#   plot_title    Title shown above the plot.
#   outlier_size  Point size used for the box-plot outliers.
#
# Returns the ggplot object; it is drawn when auto-printed at top level.
violin_by_diagnosis <- function(df, y_var, plot_title, outlier_size = 3) {
  ggplot(df, aes(x = diagnosis, y = .data[[y_var]])) +
    geom_violin(fill = "cornflowerblue") +
    geom_boxplot(
      width = .01, fill = "orange",
      outlier.color = "orange", outlier.size = outlier_size
    ) +
    # Label the y axis explicitly so it shows the column name.
    labs(title = plot_title, y = y_var)
}

# Radius: mean, se and worst
violin_by_diagnosis(
  data, "radius_mean", "Radius Mean distribution by diagnosis",
  outlier_size = 2
)
violin_by_diagnosis(
  data, "radius_se", "Radius Se distribution by diagnosis",
  outlier_size = 2
)
violin_by_diagnosis(
  data, "radius_worst", "Radius Worst distribution by diagnosis"
)

# Area: mean, se and worst
violin_by_diagnosis(data, "area_mean", "Area Mean distribution by diagnosis")
violin_by_diagnosis(data, "area_se", "Area_Se distribution by diagnosis")
violin_by_diagnosis(data, "area_worst", "Area Worst distribution by diagnosis")

# Perimeter: se, mean and worst
violin_by_diagnosis(
  data, "perimeter_se", "Perimeter_se distribution by diagnosis"
)
violin_by_diagnosis(
  data, "perimeter_mean", "perimeter mean distribution by diagnosis"
)
violin_by_diagnosis(
  data, "perimeter_worst", "Perimeter Worst distribution by diagnosis"
)
Let’s split the data now to see how tumors differ for M and B
# Partition the rows by diagnosis into a named list, then pull out the benign
# ("B") and malignant ("M") subsets as separate data sets.
cancer_split <- split(data, f = data[["diagnosis"]])
# Disabled train/test extraction, kept for reference:
# cancer_train <- training(cancer_split)
# cancer_test <- testing(cancer_split)
dataB <- cancer_split[["B"]]
dataM <- cancer_split[["M"]]

# Bar chart of observation counts per diagnosis using the default fill colours.
ggplot(
  data = data,
  mapping = aes(x = as.factor(diagnosis), fill = as.factor(diagnosis))
) +
  geom_bar() +
  labs(title = "Barplot representing two different tumors")
Now we have two different datasets for B and M